{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "299e87fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip3 install bert-tensorflow==1.0.1\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/corpus/goemotions/goemotions_1.translated.csv\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/corpus/goemotions/goemotions_2.translated.csv\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/corpus/goemotions/goemotions_3.translated.csv\n",
    "# !wget https://f000.backblazeb2.com/file/malaya-model/pretrained/fastformer-base-social-media-2021-11-02.tar.gz\n",
    "# !tar -zxf fastformer-base-social-media-2021-11-02.tar.gz\n",
    "# !rm fastformer-base-social-media-2021-11-02.tar.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c8ad7380",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "goemotions_3.translated.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 71225/71225 [00:07<00:00, 9574.98it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "goemotions_1.translated.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 70000/70000 [00:07<00:00, 9644.32it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "goemotions_2.translated.csv\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 70000/70000 [00:07<00:00, 9582.44it/s]\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from glob import glob\n",
    "from tqdm import tqdm\n",
    "\n",
    "texts, labels = [], []\n",
    "\n",
    "files = glob('goemotions_*.translated.csv')\n",
    "for file in files:\n",
    "    print(file)\n",
    "    df = pd.read_csv(file)\n",
    "    for i in tqdm(range(len(df))):\n",
    "        row = df.iloc[i]\n",
    "        label = row.iloc[9:-2].tolist()\n",
    "        t = row.iloc[[0, -2, -1]].tolist()\n",
    "        texts.extend(t)\n",
    "        labels.extend([label] * len(t))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d957d3ae",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(633675, 633675)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(texts), len(labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e452eb96",
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "from unidecode import unidecode\n",
    "import re\n",
    "\n",
    "normalized_chars = {}\n",
    "\n",
    "chars = '‒–―‐—━—-▬'\n",
    "for char in chars:\n",
    "    normalized_chars[ord(char)] = '-'\n",
    "\n",
    "chars = '«»“”¨\"'\n",
    "for char in chars:\n",
    "    normalized_chars[ord(char)] = '\"'\n",
    "\n",
    "chars = \"’'ʻˈ´`′‘’\\x92\"\n",
    "for char in chars:\n",
    "    normalized_chars[ord(char)] = \"'\"\n",
    "\n",
    "chars = '̲_'\n",
    "for char in chars:\n",
    "    normalized_chars[ord(char)] = '_'\n",
    "\n",
    "chars = '\\xad\\x7f'\n",
    "for char in chars:\n",
    "    normalized_chars[ord(char)] = ''\n",
    "\n",
    "chars = '\\n\\r\\t\\u200b\\x96'\n",
    "for char in chars:\n",
    "    normalized_chars[ord(char)] = ' '\n",
    "\n",
    "    \n",
    "laughing = {\n",
    "    'huhu',\n",
    "    'haha',\n",
    "    'gagaga',\n",
    "    'hihi',\n",
    "    'wkawka',\n",
    "    'wkwk',\n",
    "    'kiki',\n",
    "    'keke',\n",
    "    'huehue',\n",
    "    'hshs',\n",
    "    'hoho',\n",
    "    'hewhew',\n",
    "    'uwu',\n",
    "    'sksk',\n",
    "    'ksks',\n",
    "    'gituu',\n",
    "    'gitu',\n",
    "    'mmeeooww',\n",
    "    'meow',\n",
    "    'alhamdulillah',\n",
    "    'muah',\n",
    "    'mmuahh',\n",
    "    'hehe',\n",
    "    'salamramadhan',\n",
    "    'happywomensday',\n",
    "    'jahagaha',\n",
    "    'ahakss',\n",
    "    'ahksk'\n",
    "}\n",
    "\n",
    "def make_cleaning(s, c_dict):\n",
    "    s = s.translate(c_dict)\n",
    "    return s\n",
    "\n",
    "def cleaning(string):\n",
    "    \"\"\"\n",
    "    use by any transformer model before tokenization\n",
    "    \"\"\"\n",
    "    string = unidecode(string)\n",
    "    \n",
    "    string = ' '.join(\n",
    "        [make_cleaning(w, normalized_chars) for w in string.split()]\n",
    "    )\n",
    "    string = re.sub('\\(dot\\)', '.', string)\n",
    "    string = (\n",
    "        re.sub(re.findall(r'\\<a(.*?)\\>', string)[0], '', string)\n",
    "        if (len(re.findall(r'\\<a (.*?)\\>', string)) > 0)\n",
    "        and ('href' in re.findall(r'\\<a (.*?)\\>', string)[0])\n",
    "        else string\n",
    "    )\n",
    "    string = re.sub(\n",
    "        r'\\w+:\\/{2}[\\d\\w-]+(\\.[\\d\\w-]+)*(?:(?:\\/[^\\s/]*))*', ' ', string\n",
    "    )\n",
    "    \n",
    "    chars = '.,/'\n",
    "    for c in chars:\n",
    "        string = string.replace(c, f' {c} ')\n",
    "        \n",
    "    string = re.sub(r'[ ]+', ' ', string).strip().split()\n",
    "    string = [w for w in string if w[0] != '@']\n",
    "    x = []\n",
    "    for word in string:\n",
    "        word = word.lower()\n",
    "        if any([laugh in word for laugh in laughing]):\n",
    "            if random.random() >= 0.5:\n",
    "                x.append(word)\n",
    "        else:\n",
    "            x.append(word)\n",
    "    string = [w.title() if w[0].isupper() else w for w in x]\n",
    "    return ' '.join(string)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "4cd6a9e9",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 633675/633675 [00:16<00:00, 38303.55it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(633633, 633633)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X, Y = [], []\n",
    "for i in tqdm(range(len(texts))):\n",
    "    try:\n",
    "        c = cleaning(texts[i])\n",
    "        if len(c) > 2:\n",
    "            X.append(c)\n",
    "            Y.append(labels[i])\n",
    "    except:\n",
    "        pass\n",
    "    \n",
    "len(X), len(Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "99e5eddc",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2021-11-17 12:56:56.535319: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n",
      "/home/ubuntu/.local/lib/python3.8/site-packages/tensorflow_addons/utils/resource_loader.py:78: UserWarning: You are currently using TensorFlow 2.5.0 and trying to load a custom op (custom_ops/seq2seq/_beam_search_ops.so).\n",
      "TensorFlow Addons has compiled its custom ops against TensorFlow 2.6.0, and there are no compatibility guarantees between the two versions. \n",
      "This means that you might get segfaults when loading the custom op, or other kind of low-level errors.\n",
      " If you do, do not file an issue on Github. This is a known limitation.\n",
      "\n",
      "It might help you to fallback to pure Python ops by setting environment variable `TF_ADDONS_PY_OPS=1` or using `tfa.options.disable_custom_kernel()` in your code. To do that, see https://github.com/tensorflow/addons#gpucpu-custom-ops \n",
      "\n",
      "You can also change the TensorFlow version installed on your system. You would need a TensorFlow version equal to or above 2.6.0 and strictly below 2.7.0.\n",
      " Note that nightly versions of TensorFlow, as well as non-pip TensorFlow like `conda install tensorflow` or compiled from source are not supported.\n",
      "\n",
      "The last solution is to find the TensorFlow Addons version that has custom ops compatible with the TensorFlow installed on your system. To do that, refer to the readme: https://github.com/tensorflow/addons\n",
      "  warnings.warn(\n",
      "/home/ubuntu/.local/lib/python3.8/site-packages/malaya_boilerplate/frozen_graph.py:24: UserWarning: Cannot import beam_search_ops from Tensorflow Addons, `deep_model` for stemmer will not available to use, make sure Tensorflow Addons version >= 0.12.0\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "from malaya.text.bpe import WordPieceTokenizer\n",
    "\n",
    "tokenizer = WordPieceTokenizer('BERT.wordpiece', do_lower_case = False)\n",
    "# tokenizer.tokenize('halo nama sayacomel')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "14a8f2c2",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 633633/633633 [00:52<00:00, 12116.98it/s]\n"
     ]
    }
   ],
   "source": [
    "input_ids, input_masks = [], []\n",
    "actual_l = []\n",
    "\n",
    "maxlen = 200\n",
    "for i in tqdm(range(len(X))):\n",
    "    text = X[i]\n",
    "    tokens_a = tokenizer.tokenize(text)\n",
    "    tokens = [\"[CLS]\"] + tokens_a + [\"[SEP]\"]\n",
    "    input_id = tokenizer.convert_tokens_to_ids(tokens)\n",
    "    input_mask = [1] * len(input_id)\n",
    "    \n",
    "    if len(input_id) <= maxlen:\n",
    "        input_ids.append(input_id)\n",
    "        input_masks.append(input_mask)\n",
    "        actual_l.append(Y[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "1ced383d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(633612, 633612)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(input_ids), len(actual_l)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "41b9bc28",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "with open('goemotions-fastformer.pkl', 'wb') as fopen:\n",
    "    pickle.dump([input_ids, actual_l], fopen)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
